This guide is suitable for all beginners who want to learn the use cases of various libraries such as NumPy, Pandas, Seaborn, Plotly, Matplotlib and scikit-learn in a Jupyter Notebook. Each library is introduced with examples of its basic usage before moving on to more advanced examples. Each example is accompanied by explanatory comments, so readers can follow along without having to look things up elsewhere.
import numpy as np # For working with arrays and doing mathematical operations
import pandas as pd # To analyze data, compatible with reading csv, excel, json etc
import seaborn as sns # For making statistical graphics
import plotly.express as px # For data visualization and understanding data
import warnings # To ignore various waring, due to depracated packages like is_categorical_dtype
import matplotlib.pyplot as plt; # Creating static, animated, and interactive visualizations
plt.style.use('ggplot') # The list of styles available in package, choosing ggplot in our case
#Scikit-learn ( sklearn ) used for predictive data analysis, implementing machine learning models and statistical modelling .
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
# plt.subplots builds a figure holding a grid of axes; here 2 rows by 3 columns.
# figsize is given as (width, height) in inches.
# plt.show() renders the figure.
figure, axes = plt.subplots(nrows=2, ncols=3, figsize=(8, 5))
plt.show()
# When the tick labels of neighbouring subplots crowd each other, tight_layout() adds padding between the subplots.
figure, axes = plt.subplots(nrows=2, ncols=3, figsize=(8, 5))
figure.tight_layout()
plt.show()
# Build a NumPy array by hand from a Python list of values.
x = np.asarray([0, 1, 2, 3, 4, 5])
# np.arange produces the same sequence from just the bounds, without listing each value.
y = np.arange(0, 6)
x, y
(array([0, 1, 2, 3, 4, 5]), array([0, 1, 2, 3, 4, 5]))
# NumPy applies statistics and arithmetic to whole arrays at once.
mean = x.mean()  # Arithmetic mean of all values in x.
std = x.std()  # Standard deviation of all values in x.
x = x[3:]  # Arrays accept the same slice syntax as Python lists.
y = y[1:2]  # A slice that keeps a single element.
z = x + y  # y holds one element, so it is added to every element of x.
concat = np.concatenate([x, y])  # Join the two arrays end to end.
sort = np.sort(concat)  # np.sort returns a sorted copy; concat itself is unchanged.
x, y, z, concat, sort
(array([3, 4, 5]), array([1]), array([4, 5, 6]), array([3, 4, 5, 1]), array([1, 3, 4, 5]))
iris = sns.load_dataset("iris") # Loding preloaded dataset that comes along with seaborn alising as sns.
iris.head(10) # head method to show data, with parameter indicating number of the rows to be shown.
| sepal_length | sepal_width | petal_length | petal_width | species | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
| 5 | 5.4 | 3.9 | 1.7 | 0.4 | setosa |
| 6 | 4.6 | 3.4 | 1.4 | 0.3 | setosa |
| 7 | 5.0 | 3.4 | 1.5 | 0.2 | setosa |
| 8 | 4.4 | 2.9 | 1.4 | 0.2 | setosa |
| 9 | 4.9 | 3.1 | 1.5 | 0.1 | setosa |
# Using scatterplot method of seaborn with parameters mapping to coloums of the given data in data parameter.
sns.scatterplot(x="sepal_length", y="sepal_width", data=iris)
# Displaying the graph
plt.show()
np.random.seed(0) # The random numbers predictable, especially used for debugging purposes
a = np.random.randn(1000) # Creating 1000 random values on the x-axis.
b = 4 * a + np.random.randn(1000) # Adding some extra noise to create a linear relationship.
# Create a scatter plot using Seaborn
sns.set(style="whitegrid") # Setting the style type, explore documentation to learn more.
sns.scatterplot(x=a, y=b) # Creating a plot showing relationship between datapoints along x and y axis.
# Adding labels along x, y axis and adding title using the below methods.
plt.xlabel("X-axis-label")
plt.ylabel("Y-axis-label")
plt.title("Plot using Seaborn and Numpy")
# This will show the relationship between datapoints of a and b by plotting their values on a graph.
plt.show()
# Reading the spotify data with parameters as filename and encoding type.
# Source Reference: https://www.kaggle.com/datasets/nelgiriyewithana/top-spotify-songs-2023/code
dataframe = pd.read_csv('spotify-2023.csv', encoding='ISO-8859-1') # ISO 8859-1 is a single-byte encoding
print('Spotify Data Set Size {}'.format(dataframe.shape)) # Python print syntax to get size of our dataframe.
dataframe.head() # To view the data using head which by default return first 5 rows.
Spotify Data Set Size (953, 24)
| track_name | artist(s)_name | artist_count | released_year | released_month | released_day | in_spotify_playlists | in_spotify_charts | streams | in_apple_playlists | ... | bpm | key | mode | danceability_% | valence_% | energy_% | acousticness_% | instrumentalness_% | liveness_% | speechiness_% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Seven (feat. Latto) (Explicit Ver.) | Latto, Jung Kook | 2 | 2023 | 7 | 14 | 553 | 147 | 141381703 | 43 | ... | 125 | B | Major | 80 | 89 | 83 | 31 | 0 | 8 | 4 |
| 1 | LALA | Myke Towers | 1 | 2023 | 3 | 23 | 1474 | 48 | 133716286 | 48 | ... | 92 | C# | Major | 71 | 61 | 74 | 7 | 0 | 10 | 4 |
| 2 | vampire | Olivia Rodrigo | 1 | 2023 | 6 | 30 | 1397 | 113 | 140003974 | 94 | ... | 138 | F | Major | 51 | 32 | 53 | 17 | 0 | 31 | 6 |
| 3 | Cruel Summer | Taylor Swift | 1 | 2019 | 8 | 23 | 7858 | 100 | 800840817 | 116 | ... | 170 | A | Major | 55 | 58 | 72 | 11 | 0 | 11 | 15 |
| 4 | WHERE SHE GOES | Bad Bunny | 1 | 2023 | 5 | 18 | 3133 | 50 | 303236322 | 84 | ... | 144 | A | Minor | 65 | 23 | 80 | 14 | 63 | 11 | 6 |
5 rows × 24 columns
# Info method on dataframe created above helps us to get the information inside the dataframe
dataframe.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 953 entries, 0 to 952 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 track_name 953 non-null object 1 artist(s)_name 953 non-null object 2 artist_count 953 non-null int64 3 released_year 953 non-null int64 4 released_month 953 non-null int64 5 released_day 953 non-null int64 6 in_spotify_playlists 953 non-null int64 7 in_spotify_charts 953 non-null int64 8 streams 953 non-null object 9 in_apple_playlists 953 non-null int64 10 in_apple_charts 953 non-null int64 11 in_deezer_playlists 953 non-null object 12 in_deezer_charts 953 non-null int64 13 in_shazam_charts 903 non-null object 14 bpm 953 non-null int64 15 key 858 non-null object 16 mode 953 non-null object 17 danceability_% 953 non-null int64 18 valence_% 953 non-null int64 19 energy_% 953 non-null int64 20 acousticness_% 953 non-null int64 21 instrumentalness_% 953 non-null int64 22 liveness_% 953 non-null int64 23 speechiness_% 953 non-null int64 dtypes: int64(17), object(7) memory usage: 178.8+ KB
# describe method gives us the descriptive statistics of the dataframe
dataframe.describe()
| artist_count | released_year | released_month | released_day | in_spotify_playlists | in_spotify_charts | in_apple_playlists | in_apple_charts | in_deezer_charts | bpm | danceability_% | valence_% | energy_% | acousticness_% | instrumentalness_% | liveness_% | speechiness_% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.00000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 | 953.000000 |
| mean | 1.556139 | 2018.238195 | 6.033578 | 13.930745 | 5200.124869 | 12.009444 | 67.812172 | 51.908709 | 2.666317 | 122.540399 | 66.96957 | 51.431270 | 64.279119 | 27.057712 | 1.581322 | 18.213012 | 10.131165 |
| std | 0.893044 | 11.116218 | 3.566435 | 9.201949 | 7897.608990 | 19.575992 | 86.441493 | 50.630241 | 6.035599 | 28.057802 | 14.63061 | 23.480632 | 16.550526 | 25.996077 | 8.409800 | 13.711223 | 9.912888 |
| min | 1.000000 | 1930.000000 | 1.000000 | 1.000000 | 31.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 65.000000 | 23.00000 | 4.000000 | 9.000000 | 0.000000 | 0.000000 | 3.000000 | 2.000000 |
| 25% | 1.000000 | 2020.000000 | 3.000000 | 6.000000 | 875.000000 | 0.000000 | 13.000000 | 7.000000 | 0.000000 | 100.000000 | 57.00000 | 32.000000 | 53.000000 | 6.000000 | 0.000000 | 10.000000 | 4.000000 |
| 50% | 1.000000 | 2022.000000 | 6.000000 | 13.000000 | 2224.000000 | 3.000000 | 34.000000 | 38.000000 | 0.000000 | 121.000000 | 69.00000 | 51.000000 | 66.000000 | 18.000000 | 0.000000 | 12.000000 | 6.000000 |
| 75% | 2.000000 | 2022.000000 | 9.000000 | 22.000000 | 5542.000000 | 16.000000 | 88.000000 | 87.000000 | 2.000000 | 140.000000 | 78.00000 | 70.000000 | 77.000000 | 43.000000 | 0.000000 | 24.000000 | 11.000000 |
| max | 8.000000 | 2023.000000 | 12.000000 | 31.000000 | 52898.000000 | 147.000000 | 672.000000 | 275.000000 | 58.000000 | 206.000000 | 96.00000 | 97.000000 | 97.000000 | 97.000000 | 91.000000 | 97.000000 | 64.000000 |
# Creating a list of columns to drop and passing it as the columns parameter of the drop method on the dataframe.
drop_columns = ['liveness_%', 'instrumentalness_%', 'mode', 'key','valence_%']
dataframe = dataframe.drop(columns=drop_columns)
dataframe.head(10)
| track_name | artist(s)_name | artist_count | released_year | released_month | released_day | in_spotify_playlists | in_spotify_charts | streams | in_apple_playlists | in_apple_charts | in_deezer_playlists | in_deezer_charts | in_shazam_charts | bpm | danceability_% | energy_% | acousticness_% | speechiness_% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Seven (feat. Latto) (Explicit Ver.) | Latto, Jung Kook | 2 | 2023 | 7 | 14 | 553 | 147 | 141381703 | 43 | 263 | 45 | 10 | 826 | 125 | 80 | 83 | 31 | 4 |
| 1 | LALA | Myke Towers | 1 | 2023 | 3 | 23 | 1474 | 48 | 133716286 | 48 | 126 | 58 | 14 | 382 | 92 | 71 | 74 | 7 | 4 |
| 2 | vampire | Olivia Rodrigo | 1 | 2023 | 6 | 30 | 1397 | 113 | 140003974 | 94 | 207 | 91 | 14 | 949 | 138 | 51 | 53 | 17 | 6 |
| 3 | Cruel Summer | Taylor Swift | 1 | 2019 | 8 | 23 | 7858 | 100 | 800840817 | 116 | 207 | 125 | 12 | 548 | 170 | 55 | 72 | 11 | 15 |
| 4 | WHERE SHE GOES | Bad Bunny | 1 | 2023 | 5 | 18 | 3133 | 50 | 303236322 | 84 | 133 | 87 | 15 | 425 | 144 | 65 | 80 | 14 | 6 |
| 5 | Sprinter | Dave, Central Cee | 2 | 2023 | 6 | 1 | 2186 | 91 | 183706234 | 67 | 213 | 88 | 17 | 946 | 141 | 92 | 58 | 19 | 24 |
| 6 | Ella Baila Sola | Eslabon Armado, Peso Pluma | 2 | 2023 | 3 | 16 | 3090 | 50 | 725980112 | 34 | 222 | 43 | 13 | 418 | 148 | 67 | 76 | 48 | 3 |
| 7 | Columbia | Quevedo | 1 | 2023 | 7 | 7 | 714 | 43 | 58149378 | 25 | 89 | 30 | 13 | 194 | 100 | 67 | 71 | 37 | 4 |
| 8 | fukumean | Gunna | 1 | 2023 | 5 | 15 | 1096 | 83 | 95217315 | 60 | 210 | 48 | 11 | 953 | 130 | 85 | 62 | 12 | 9 |
| 9 | La Bebe - Remix | Peso Pluma, Yng Lvcas | 2 | 2023 | 3 | 17 | 2953 | 44 | 553634067 | 49 | 110 | 66 | 13 | 339 | 170 | 81 | 48 | 21 | 33 |
# If you want to get only a specified column, such as 'artist(s)_name', from the dataframe
dataframe['artist(s)_name']
0 Latto, Jung Kook
1 Myke Towers
2 Olivia Rodrigo
3 Taylor Swift
4 Bad Bunny
...
948 Selena Gomez
949 Taylor Swift
950 Feid, Paulo Londra
951 Feid, Sech, Jhayco
952 Burna Boy
Name: artist(s)_name, Length: 953, dtype: object
# Reading specific rows of the 'artist(s)_name' column
dataframe['artist(s)_name'][0:5]
0 Latto, Jung Kook 1 Myke Towers 2 Olivia Rodrigo 3 Taylor Swift 4 Bad Bunny Name: artist(s)_name, dtype: object
# To read the specific row, by giving a specific row number into iloc method
dataframe.iloc[0]
track_name Seven (feat. Latto) (Explicit Ver.) artist(s)_name Latto, Jung Kook artist_count 2 released_year 2023 released_month 7 released_day 14 in_spotify_playlists 553 in_spotify_charts 147 streams 141381703 in_apple_playlists 43 in_apple_charts 263 in_deezer_playlists 45 in_deezer_charts 10 in_shazam_charts 826 bpm 125 danceability_% 80 energy_% 83 acousticness_% 31 speechiness_% 4 Name: 0, dtype: object
# Similarly, to read the value of a single cell, pass the row and column positions to the iloc method
dataframe.iloc[2, 1]
'Olivia Rodrigo'
# To sort the rows of the dataframe by the values of specific columns
dataframe.sort_values(["released_year"], ascending = False)
| track_name | artist(s)_name | artist_count | released_year | released_month | released_day | in_spotify_playlists | in_spotify_charts | streams | in_apple_playlists | in_apple_charts | in_deezer_playlists | in_deezer_charts | in_shazam_charts | bpm | danceability_% | energy_% | acousticness_% | speechiness_% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Seven (feat. Latto) (Explicit Ver.) | Latto, Jung Kook | 2 | 2023 | 7 | 14 | 553 | 147 | 141381703 | 43 | 263 | 45 | 10 | 826 | 125 | 80 | 83 | 31 | 4 |
| 210 | Abcdario | Junior H, Eden Mu�ï | 2 | 2023 | 5 | 13 | 262 | 5 | 89933133 | 8 | 60 | 4 | 1 | 109 | 129 | 70 | 43 | 78 | 3 |
| 351 | PERO Tï¿ | Karol G, Quevedo | 2 | 2023 | 2 | 23 | 387 | 11 | 93438910 | 11 | 15 | 14 | 3 | 1 | 140 | 86 | 79 | 39 | 29 |
| 174 | ýýýýýýýýýýýý | YOASOBI | 1 | 2023 | 4 | 12 | 356 | 16 | 143573775 | 35 | 102 | 8 | 1 | 117 | 166 | 57 | 94 | 11 | 9 |
| 348 | Red Ruby Da Sleeze | Nicki Minaj | 1 | 2023 | 3 | 3 | 1168 | 0 | 81419389 | 45 | 11 | 20 | 0 | 21 | 98 | 70 | 73 | 12 | 26 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 459 | A Holly Jolly Christmas - Single Version | Burl Ives | 1 | 1952 | 1 | 1 | 7930 | 0 | 395591396 | 108 | 120 | 73 | 0 | 0 | 140 | 67 | 36 | 64 | 3 |
| 466 | Let It Snow! Let It Snow! Let It Snow! | Frank Sinatra, B. Swanson Quartet | 2 | 1950 | 1 | 1 | 10585 | 0 | 473248298 | 126 | 108 | 406 | 0 | 0 | 143 | 60 | 32 | 88 | 6 |
| 460 | The Christmas Song (Merry Christmas To You) - ... | Nat King Cole | 1 | 1946 | 11 | 1 | 11500 | 0 | 389771964 | 140 | 72 | 251 | 0 | 0 | 139 | 36 | 15 | 84 | 4 |
| 469 | White Christmas | Bing Crosby, John Scott Trotter & His Orchestr... | 3 | 1942 | 1 | 1 | 11940 | 0 | 395591396 | 73 | 79 | 123 | 0 | 0 | 96 | 23 | 25 | 91 | 3 |
| 439 | Agudo M��gi | Styrx, utku INC, Thezth | 3 | 1930 | 1 | 1 | 323 | 0 | 90598517 | 4 | 0 | 14 | 0 | 0 | 130 | 65 | 80 | 22 | 5 |
953 rows × 19 columns
# To find the unique values of a specific column of the dataframe
dataframe["released_year"].unique()
array([2023, 2019, 2022, 2013, 2014, 2018, 2017, 2020, 2016, 2012, 1999,
2008, 1975, 2021, 2015, 2011, 2004, 1985, 2007, 2002, 2010, 1983,
1992, 1968, 1984, 2000, 1997, 1995, 2003, 1973, 1930, 1994, 1958,
1957, 1963, 1959, 1970, 1971, 1952, 1946, 1979, 1950, 1942, 1986,
2005, 1991, 1996, 1998, 1982, 1987], dtype=int64)
# Combine the separate year / month / day columns into one new 'release_date' column.
# pd.to_datetime can assemble dates directly from a frame whose columns are named
# 'year', 'month' and 'day', so no string conversion or concatenation is needed.
# errors='coerce' turns any invalid combination into NaT instead of raising an error.
date_parts = dataframe[['released_year', 'released_month', 'released_day']].rename(
    columns={'released_year': 'year', 'released_month': 'month', 'released_day': 'day'}
)
dataframe['release_date'] = pd.to_datetime(date_parts, errors='coerce')
# The component columns are now redundant, so drop them the same way as in the previous example.
drop_columns = ['released_year', 'released_month', 'released_day']
dataframe = dataframe.drop(columns=drop_columns)
dataframe.head()
| track_name | artist(s)_name | artist_count | in_spotify_playlists | in_spotify_charts | streams | in_apple_playlists | in_apple_charts | in_deezer_playlists | in_deezer_charts | in_shazam_charts | bpm | danceability_% | energy_% | acousticness_% | speechiness_% | release_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Seven (feat. Latto) (Explicit Ver.) | Latto, Jung Kook | 2 | 553 | 147 | 141381703 | 43 | 263 | 45 | 10 | 826 | 125 | 80 | 83 | 31 | 4 | 2023-07-14 |
| 1 | LALA | Myke Towers | 1 | 1474 | 48 | 133716286 | 48 | 126 | 58 | 14 | 382 | 92 | 71 | 74 | 7 | 4 | 2023-03-23 |
| 2 | vampire | Olivia Rodrigo | 1 | 1397 | 113 | 140003974 | 94 | 207 | 91 | 14 | 949 | 138 | 51 | 53 | 17 | 6 | 2023-06-30 |
| 3 | Cruel Summer | Taylor Swift | 1 | 7858 | 100 | 800840817 | 116 | 207 | 125 | 12 | 548 | 170 | 55 | 72 | 11 | 15 | 2019-08-23 |
| 4 | WHERE SHE GOES | Bad Bunny | 1 | 3133 | 50 | 303236322 | 84 | 133 | 87 | 15 | 425 | 144 | 65 | 80 | 14 | 6 | 2023-05-18 |
# To ignore the warnings due to deprecated packages
warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")
# Getting the top 20 singers by number of songs and making a bar plot from the data.
top_singer = dataframe['artist(s)_name'].value_counts().head(20)
plt.figure(figsize=(12, 6))
sns.barplot(x=top_singer.values, y=top_singer.index, palette='crest')
plt.xlabel('Total Song')
plt.ylabel('Singer Name')
plt.title('Top 20 Singer with most number of songs')
plt.show()
# BarPlot indicates that Taylor Swift has almost 35 songs, after that singer at 2-5 are almost equal number of songs.
# Getting coloum of bpm and danceability
beat_per_minute = dataframe['bpm']
dance_ability = dataframe['danceability_%']
# Creating a scatter plot using Seaborn
plt.figure(figsize=(8, 6))
sns.scatterplot(x=beat_per_minute, y=dance_ability, color='green', marker='*')
plt.xlabel('Beat per minute ( BPM )')
plt.ylabel('Danceability')
plt.title('Scatter Plot : BPM vs. Danceability')
# Show the plot
plt.grid(True)
plt.show()
# Analysing the plot it seems there is no correlation between bpm and danceability
# Getting the top 20 songs based on the value of streams in the Spotify data.
# The streams column has a malformed value at index 574, so we need to clean the data before processing it.
# 'data' is a second name for the same DataFrame object (not a copy), so the cleaned
# 'streams' column is also visible through 'dataframe'.
data = dataframe
# Convert 'streams' to numbers. errors='coerce' turns every non-numeric entry into NaN,
# so malformed rows do not have to be matched by their exact text.
data['streams'] = pd.to_numeric(data['streams'], errors='coerce')
# Replace the entries that could not be parsed with the mean of the valid stream counts
# (Series.mean skips NaN by default).
stream_mean_value = data['streams'].mean()
data['streams'] = data['streams'].fillna(stream_mean_value)
top_20_songs = data[['track_name', 'artist(s)_name', 'streams']].sort_values(by='streams', ascending=False).head(20)
plt.figure(figsize=(12, 6))
sns.barplot(x=top_20_songs['streams'], y=top_20_songs['track_name'], palette='flare')
plt.xlabel('Streams (in billions)')
plt.ylabel('Song Name')
plt.title('Top 20 Songs with Most Streams on Spotify')
plt.show()
top_20_songs
# Conclusion
# There are 2 song which are above 3+ billions and rest are above 2+ billions.
| track_name | artist(s)_name | streams | |
|---|---|---|---|
| 55 | Blinding Lights | The Weeknd | 3.703895e+09 |
| 179 | Shape of You | Ed Sheeran | 3.562544e+09 |
| 86 | Someone You Loved | Lewis Capaldi | 2.887242e+09 |
| 620 | Dance Monkey | Tones and I | 2.864792e+09 |
| 41 | Sunflower - Spider-Man: Into the Spider-Verse | Post Malone, Swae Lee | 2.808097e+09 |
| 162 | One Dance | Drake, WizKid, Kyla | 2.713922e+09 |
| 84 | STAY (with Justin Bieber) | Justin Bieber, The Kid Laroi | 2.665344e+09 |
| 140 | Believer | Imagine Dragons | 2.594040e+09 |
| 725 | Closer | The Chainsmokers, Halsey | 2.591224e+09 |
| 48 | Starboy | The Weeknd, Daft Punk | 2.565530e+09 |
| 138 | Perfect | Ed Sheeran | 2.559529e+09 |
| 71 | Heat Waves | Glass Animals | 2.557976e+09 |
| 14 | As It Was | Harry Styles | 2.513188e+09 |
| 693 | Se��o | Shawn Mendes, Camila Cabello | 2.484813e+09 |
| 324 | Say You Won't Let Go | James Arthur | 2.420461e+09 |
| 128 | lovely - Bonus Track | Billie Eilish, Khalid | 2.355720e+09 |
| 127 | Watermelon Sugar | Harry Styles | 2.322580e+09 |
| 407 | Don't Start Now | Dua Lipa | 2.303034e+09 |
| 621 | Lucid Dreams | Juice WRLD | 2.288695e+09 |
| 73 | Sweater Weather | The Neighbourhood | 2.282771e+09 |
# To group the datapoints by artist(s)_name and aggregating number of songs and total streams
# Give a dictionary with column name and a parameter like count,sum,min,max etc to do the operations
# reset_index method will reset the index of the dataframe and use the default one instead
dataframe.groupby('artist(s)_name').agg({'track_name': 'count', 'streams': 'sum'}).reset_index()
| artist(s)_name | track_name | streams | |
|---|---|---|---|
| 0 | (G)I-DLE | 2 | 2.669833e+08 |
| 1 | 21 Savage, Gunna | 1 | 6.068094e+07 |
| 2 | 24kgoldn, Iann Dior | 1 | 1.699402e+09 |
| 3 | 50 Cent | 1 | 1.202723e+09 |
| 4 | A$AP Rocky, Metro Boomin, Roisee | 1 | 9.418647e+07 |
| ... | ... | ... | ... |
| 640 | j-hope | 1 | 1.557958e+08 |
| 641 | j-hope, J. Cole | 1 | 1.165998e+08 |
| 642 | sped up 8282 | 1 | 1.037625e+08 |
| 643 | sped up nightcore, ARIZONATEARS, Lil Uzi Vert | 1 | 2.070333e+08 |
| 644 | teto | 1 | 1.391938e+08 |
645 rows × 3 columns
# Which tracks in the 2023 dataset have the most streams?
# Order the rows from most to least streamed and keep the first 30.
stream_sort_by = dataframe.sort_values(by="streams", ascending=False)
top_30_songs = stream_sort_by.head(30)
# One bar per track, coloured per track; hover_name uses the artist name as the hover title.
# color_continuous_scale is not passed: it only applies when 'color' refers to numeric
# data, and 'track_name' is categorical.
figure = px.bar(top_30_songs, x='track_name', y='streams',
                title='Most streamed tracks in 2023',
                color='track_name',
                hover_name='artist(s)_name')
# Order the categories by descending total and label both axes.
figure.update_xaxes(categoryorder='total descending', title_text='Song Names')
figure.update_yaxes(title_text='Total Streams')
# Fix the overall size of the chart.
figure.update_layout(width=1200, height=800)
# No plt.tight_layout() here: it is a Matplotlib call that does not affect a Plotly
# figure and only produced an empty Matplotlib figure in the cell output.
figure.show()
<Figure size 640x480 with 0 Axes>
# Per-singer summary: number of tracks and total streams for each 'artist(s)_name' value.
singer_statistics = data.groupby('artist(s)_name').agg({'track_name': 'count', 'streams': 'sum'}).reset_index()
singer_statistics.columns = ['Singer', 'Total Songs', 'Net Streams']
# Average streams per song for each singer, divided by a billion.
singer_statistics['Streams Average per Song'] = (singer_statistics['Net Streams'] / singer_statistics['Total Songs']) / 1e9
# Sort by total streams in descending order and keep the top 20 singers.
singer_statistics = singer_statistics.sort_values(by='Net Streams', ascending=False)
top_20_artists = singer_statistics.head(20)


def _singer_bar_chart(y_column, chart_title, y_axis_title):
    """Return a Plotly bar chart of one top_20_artists column, one bar per singer."""
    # color_continuous_scale is not passed: 'Singer' is categorical, so a
    # continuous colour scale would not be applied.
    chart = px.bar(top_20_artists, x='Singer', y=y_column,
                   title=chart_title,
                   color='Singer')
    chart.update_xaxes(categoryorder='total descending', title_text='Singer')
    chart.update_yaxes(title_text=y_axis_title)
    return chart


# Bar chart of the net streams of each singer
figure_one = _singer_bar_chart('Net Streams', 'Most Streamed Singer in 2023', 'Total Streams')
# Bar chart of the total songs sung by each singer
figure_two = _singer_bar_chart('Total Songs', 'Total Songs by Singer (Top 20)', 'Total Songs')
# Bar chart of the average streams per song of each singer
figure_three = _singer_bar_chart('Streams Average per Song', 'Average Streams per Song (Top 20)',
                                 'Streams Average per Song')
# Show the bar charts
figure_one.show()
figure_two.show()
figure_three.show()
# Below line is used to ignore warning due to deprecated packages
warnings.filterwarnings('ignore')
s_data = pd.read_csv('spotify-2023.csv', encoding='ISO-8859-1')
s_data.head()
# Creating new relevant information based upon the features coloum of individual songs
features_coloum = [ 'liveness_%', 'speechiness_%' ,'danceability_%', 'bpm',
'instrumentalness_%', 'valence_%', 'energy_%', 'acousticness_%', ]
s_data[features_coloum].head()
# Normalizes features by scaling each feature to a given range (0 to 1 by default, as in the example below).
# fit_transform fits the scaler to the data, then transforms it.
mmscaler = MinMaxScaler()
standardize_features = mmscaler.fit_transform(s_data[features_coloum])
# Converting the standardize features again to dataframe to better handle the data
normalized_dataframe = pd.DataFrame(standardize_features, columns=features_coloum)
normalized_dataframe.head()
| liveness_% | speechiness_% | danceability_% | bpm | instrumentalness_% | valence_% | energy_% | acousticness_% | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.053191 | 0.032258 | 0.780822 | 0.425532 | 0.000000 | 0.913978 | 0.840909 | 0.319588 |
| 1 | 0.074468 | 0.032258 | 0.657534 | 0.191489 | 0.000000 | 0.612903 | 0.738636 | 0.072165 |
| 2 | 0.297872 | 0.064516 | 0.383562 | 0.517730 | 0.000000 | 0.301075 | 0.500000 | 0.175258 |
| 3 | 0.085106 | 0.209677 | 0.438356 | 0.744681 | 0.000000 | 0.580645 | 0.715909 | 0.113402 |
| 4 | 0.085106 | 0.064516 | 0.575342 | 0.560284 | 0.692308 | 0.204301 | 0.806818 | 0.144330 |
# Song Recomendation based upon the features
from sklearn.metrics.pairwise import cosine_similarity
# Now we can compute similarities
computed_matrix = cosine_similarity(normalized_dataframe)
# Transforming the calculated matrix into a DataFrame for easier handling and reading
computed_dataframe = pd.DataFrame(computed_matrix, index=s_data['track_name'], columns=s_data['track_name'])
computed_dataframe.head(10)
| track_name | Seven (feat. Latto) (Explicit Ver.) | LALA | vampire | Cruel Summer | WHERE SHE GOES | Sprinter | Ella Baila Sola | Columbia | fukumean | La Bebe - Remix | ... | Privileged Rappers | The Astronaut | BackOutsideBoyz | Broke Boys | The Great War | My Mind & Me | Bigger Than The Whole Sky | A Veces (feat. Feid) | En La De Ella | Alone |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| track_name | |||||||||||||||||||||
| Seven (feat. Latto) (Explicit Ver.) | 1.000000 | 0.975647 | 0.880344 | 0.913981 | 0.755991 | 0.936269 | 0.979482 | 0.910542 | 0.855850 | 0.865533 | ... | 0.921663 | 0.858991 | 0.796932 | 0.741030 | 0.981515 | 0.805591 | 0.528567 | 0.970908 | 0.974293 | 0.943181 |
| LALA | 0.975647 | 1.000000 | 0.848438 | 0.876134 | 0.761163 | 0.916185 | 0.917155 | 0.908580 | 0.876697 | 0.819718 | ... | 0.924463 | 0.890907 | 0.798221 | 0.775496 | 0.978372 | 0.707324 | 0.380635 | 0.984703 | 0.997566 | 0.970845 |
| vampire | 0.880344 | 0.848438 | 1.000000 | 0.947706 | 0.808084 | 0.877307 | 0.905297 | 0.879835 | 0.919387 | 0.880600 | ... | 0.881834 | 0.918433 | 0.875415 | 0.865885 | 0.873215 | 0.860803 | 0.685082 | 0.804157 | 0.850583 | 0.881730 |
| Cruel Summer | 0.913981 | 0.876134 | 0.947706 | 1.000000 | 0.800570 | 0.910577 | 0.931923 | 0.837759 | 0.855452 | 0.922431 | ... | 0.913622 | 0.914379 | 0.844334 | 0.835299 | 0.899534 | 0.808021 | 0.604323 | 0.850565 | 0.871277 | 0.871340 |
| WHERE SHE GOES | 0.755991 | 0.761163 | 0.808084 | 0.800570 | 1.000000 | 0.753580 | 0.754172 | 0.806952 | 0.811049 | 0.736370 | ... | 0.765912 | 0.847082 | 0.707696 | 0.764178 | 0.775037 | 0.717606 | 0.529490 | 0.702161 | 0.759296 | 0.812195 |
| Sprinter | 0.936269 | 0.916185 | 0.877307 | 0.910577 | 0.753580 | 1.000000 | 0.909120 | 0.871056 | 0.917768 | 0.973289 | ... | 0.988539 | 0.827115 | 0.935473 | 0.872006 | 0.889540 | 0.820281 | 0.558631 | 0.928006 | 0.933421 | 0.896686 |
| Ella Baila Sola | 0.979482 | 0.917155 | 0.905297 | 0.931923 | 0.754172 | 0.909120 | 1.000000 | 0.901235 | 0.824533 | 0.870439 | ... | 0.878414 | 0.839490 | 0.767688 | 0.711224 | 0.957101 | 0.877326 | 0.665652 | 0.905283 | 0.913442 | 0.901702 |
| Columbia | 0.910542 | 0.908580 | 0.879835 | 0.837759 | 0.806952 | 0.871056 | 0.901235 | 1.000000 | 0.919015 | 0.806597 | ... | 0.849602 | 0.900670 | 0.775121 | 0.816559 | 0.937927 | 0.872293 | 0.645065 | 0.847442 | 0.908928 | 0.973997 |
| fukumean | 0.855850 | 0.876697 | 0.919387 | 0.855452 | 0.811049 | 0.917768 | 0.824533 | 0.919015 | 1.000000 | 0.881745 | ... | 0.934555 | 0.910505 | 0.923804 | 0.931598 | 0.845892 | 0.822830 | 0.572939 | 0.837234 | 0.894846 | 0.924033 |
| La Bebe - Remix | 0.865533 | 0.819718 | 0.880600 | 0.922431 | 0.736370 | 0.973289 | 0.870439 | 0.806597 | 0.881745 | 1.000000 | ... | 0.955889 | 0.791919 | 0.944807 | 0.888221 | 0.810054 | 0.836975 | 0.642219 | 0.830181 | 0.838514 | 0.815200 |
10 rows × 953 columns
# Defining a function which will suggest songs based upon the song input and number of recommedations.
def songs_suggestions(original_song, recommendations=10, similarity=None):
    """Suggest the songs most similar to ``original_song``.

    Parameters:
        original_song: Track name to look up in the similarity matrix.
        recommendations: Maximum number of suggestions to return.
        similarity: Optional square DataFrame of pairwise similarities whose
            index and columns are track names. When omitted, the notebook-level
            ``computed_dataframe`` is used.

    Returns:
        A list of track names ordered from most to least similar, never
        containing ``original_song`` itself. If the song is not present in the
        matrix, an apology message string is returned instead.
    """
    # Fall back to the notebook-level similarity matrix when none is supplied.
    matrix = computed_dataframe if similarity is None else similarity
    # Look up the song in the matrix.
    if original_song not in matrix.index:
        return "We are sorry, no more song suggestion at this moment"
    # Read the song's similarity scores by row, matching the index check above.
    scores = matrix.loc[original_song]
    # A track name that occurs more than once yields several rows; use the first.
    if scores.ndim > 1:
        scores = scores.iloc[0]
    ranked = scores.sort_values(ascending=False)
    # Exclude the song itself by name instead of assuming it is ranked first.
    ranked = ranked[ranked.index != original_song]
    # Keep only the best-scoring entry for any repeated track name.
    ranked = ranked[~ranked.index.duplicated(keep="first")]
    # A negative count is treated as zero suggestions.
    suggestions = ranked.head(max(recommendations, 0)).index.tolist()
    return suggestions
# Passing the random song into our functions and getting the suggested song ouptut
song = "WHERE SHE GOES"
suggestion_output = songs_suggestions(song, 10)
suggestion_output
['After Dark', 'I Know - PR1SVX Edit', 'Poland', 'Apocalypse', 'B.O.T.A. (Baddest Of Them All) - Edit', 'METAMORPHOSIS', 'Makeba', 'Freaks', 'Rumble', 'Master of Puppets (Remastered)']